import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import copy


# 为了在jupyternotebook画图中显示中文
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# 导入数据
train_data = pd.read_csv('./data/cs-training.csv')
train_data = train_data.iloc[:, 1:]
train_data.info()


# 缺失值处理，利用随机森林算法进行填充
mData = train_data.iloc[:, [5, 0, 1, 2, 3, 4, 6, 7, 8, 9]]  # 将缺失列 MonthlyIncome放在最前面,为了后面预测值得取值方便，并且重新赋予一个新的dataframe

# MonthlyIncome 非空
train_known = mData[mData.MonthlyIncome.notnull()].values

# MonthlyIncome 缺失
train_unknown = mData[mData.MonthlyIncome.isnull()].as_matrix()


# 利用随机森林预测缺失值
train_X = train_known[:, 1:]
train_y = train_known[:, 0]   # 0 列为MonthlyIncome缺失列

rfr = RandomForestRegressor(random_state=0, n_estimators=200, max_depth=3, n_jobs=-1)
rfr.fit(train_X, train_y)

predicted_y = rfr.predict(train_unknown[:, 1:]).round(0)  # 利用预测值填充原始数据中的缺失值
train_data.loc[train_data.MonthlyIncome.isnull(), 'MonthlyIncome'] = predicted_y

train_data = train_data.dropna()  # 舍弃缺失值
train_data = train_data.drop_duplicates()  # 舍弃重复值


# 异常值处理

# 查某些属性的异常值
train_box = train_data.iloc[:, [3, 7, 9]]
train_box.boxplot()

# 去除异常值
train_data = train_data[train_data['NumberOfTime30-59DaysPastDueNotWorse'] < 90]
train_data = train_data[train_data['age'] > 0]


# 切分数据集

X = train_data.iloc[:, 1:]
y = train_data.iloc[:, 0]

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=0)

ntrain_data = pd.concat([train_y, train_X], axis=1)
ntest_data = pd.concat([test_y, test_X], axis=1)


# 单调分箱
# Y 中 1表示好样本
def mono_bin(Y, X, n=10):
    r = 0
    good = Y.sum()
    bad = Y.count() - good
    while np.abs(r) < 1: 
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n)})
        d2 = d1.groupby('Bucket', as_index=True)
        r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)  
        n = n - 1
    d3 = pd.DataFrame(d2.X.min(), columns=['min'])
    d3['min'] = d2.min().X
    d3['max'] = d2.max().X
    d3['sum'] = d2.sum().Y
    d3['total'] = d2.count().Y
    d3['rate'] = d2.mean().Y
    d3['woe'] = np.log((d3['rate']/good)/((1-d3['rate'])/bad))
    d3['goodattribute'] = d3['sum']/good
    d3['badattribute'] = (d3['total']-d3['sum'])/bad
    iv = ((d3['goodattribute']-d3['badattribute'])*d3['woe']).sum()
    d4 = (d3.sort_index(by='min')).reset_index(drop=True)
    woe = list(d4['woe'].round(3))
    cut = []
    cut.append(float('-inf'))
    for i in range(1, n+1):
        qua = X.quantile(i/(n+1))
        cut.append(round(qua, 4))
    cut.append(float('inf'))
    return d4, iv, cut, woe


# 对几个可分箱的属性变量进行分箱操作
x1_d, x1_iv, x1_cut, x1_woe = mono_bin(train_y, train_X.RevolvingUtilizationOfUnsecuredLines)
 
x2_d, x2_iv, x2_cut, x2_woe = mono_bin(train_y, train_X.age)
 
x4_d, x4_iv, x4_cut, x4_woe = mono_bin(train_y, train_X.DebtRatio)
 
x5_d, x5_iv, x5_cut, x5_woe = mono_bin(train_y, train_X.MonthlyIncome)


# 对几个有关次数值得属性进行分箱操作--- 离散值分箱
def woe_value(d1):
    d2 = d1.groupby('Bucket', as_index=True)
    good = train_y.sum()
    bad = train_y.count()-good
    d3 = pd.DataFrame(d2.X.min(), columns=['min'])
    d3['min'] = d2.min().X
    d3['max'] = d2.max().X
    d3['sum'] = d2.sum().Y
    d3['total'] = d2.count().Y
    d3['rate'] = d2.mean().Y
    d3['woe'] = np.log((d3['rate']/good)/((1-d3['rate'])/bad))
    d3['goodattribute'] = d3['sum']/good
    d3['badattribute'] = (d3['total']-d3['sum'])/bad
    iv = ((d3['goodattribute']-d3['badattribute'])*d3['woe']).sum()
    d4 = (d3.sort_index(by='min')).reset_index(drop=True)
    woe = list(d4['woe'].round(3))
    return d4, iv, woe


# 特征分箱处理
d3 = pd.DataFrame({"X": train_X['NumberOfTime30-59DaysPastDueNotWorse'], "Y": train_y})
d3['Bucket'] = d3['X']
d3_x1 = d3.loc[(d3['Bucket'] <= 0)]
d3_x1.loc[:, 'Bucket'] = "(-inf,0]"
 
d3_x2 = d3.loc[(d3['Bucket'] > 0) & (d3['Bucket'] <= 1)]
d3_x2.loc[:, 'Bucket'] = "(0,1]"
 
 
d3_x3 = d3.loc[(d3['Bucket'] > 1) & (d3['Bucket'] <= 3)]
d3_x3.loc[:, 'Bucket'] = "(1,3]"
 
 
d3_x4 = d3.loc[(d3['Bucket'] > 3) & (d3['Bucket'] <= 5)]
d3_x4.loc[:, 'Bucket'] = "(3,5]"
 
 
d3_x5 = d3.loc[(d3['Bucket'] > 5)]
d3_x5.loc[:, 'Bucket'] = "(5,+inf)"
d3 = pd.concat([d3_x1, d3_x2, d3_x3, d3_x4, d3_x5])
 
 
x3_d, x3_iv, x3_woe = woe_value(d3)
x3_cut = [float('-inf'), 0, 1, 3, 5, float('+inf')]


d6 = pd.DataFrame({"X": train_X['NumberOfOpenCreditLinesAndLoans'], "Y": train_y})
d6['Bucket'] = d6['X']

d6_x1 = d6.loc[(d6['Bucket'] <= 0)]
d6_x1.loc[:, 'Bucket'] = "(-inf,0]"

d6_x2 = d6.loc[(d6['Bucket'] > 0) & (d6['Bucket'] <= 1)]
d6_x2.loc[:, 'Bucket'] = "(0,1]"

d6_x3 = d6.loc[(d6['Bucket'] > 1) & (d6['Bucket'] <= 3)]
d6_x3.loc[:,'Bucket'] = "(1,3]"

d6_x4 = d6.loc[(d6['Bucket'] > 3) & (d6['Bucket'] <= 5)]
d6_x4.loc[:, 'Bucket'] = "(3,5]"

d6_x5 = d6.loc[(d6['Bucket'] > 5)]
d6_x5.loc[:, 'Bucket'] = "(5,+inf)"
d6 = pd.concat([d6_x1, d6_x2, d6_x3, d6_x4, d6_x5])

x6_d, x6_iv, x6_woe = woe_value(d6)
x6_cut = [float('-inf'), 1, 2, 3, 5, float('+inf')]


d7 = pd.DataFrame({"X": train_X['NumberOfTimes90DaysLate'], "Y": train_y})
d7['Bucket'] = d7['X']
d7_x1 = d7.loc[(d7['Bucket'] <= 0)]
d7_x1.loc[:, 'Bucket'] = "(-inf,0]"

d7_x2 = d7.loc[(d7['Bucket'] > 0) & (d7['Bucket'] <= 1)]
d7_x2.loc[:, 'Bucket'] = "(0,1]"

d7_x3 = d7.loc[(d7['Bucket'] > 1) & (d7['Bucket'] <= 3)]
d7_x3.loc[:, 'Bucket'] = "(1,3]"

d7_x4 = d7.loc[(d7['Bucket'] > 3) & (d7['Bucket'] <= 5)]
d7_x4.loc[:, 'Bucket'] = "(3,5]"

d7_x5 = d7.loc[(d7['Bucket'] > 5)]
d7_x5.loc[:, 'Bucket'] = "(5,+inf)"
d7 = pd.concat([d7_x1, d7_x2, d7_x3, d7_x4, d7_x5])

x7_d, x7_iv, x7_woe = woe_value(d7)
x7_cut = [float('-inf'), 0, 1, 3, 5, float('+inf')]


d8 = pd.DataFrame({"X": train_X['NumberRealEstateLoansOrLines'], "Y": train_y})
d8['Bucket'] = d8['X']
d8_x1 = d8.loc[(d8['Bucket'] <= 0)]
d8_x1.loc[:, 'Bucket'] = "(-inf,0]"
 
 
d8_x2 = d8.loc[(d8['Bucket'] > 0) & (d8['Bucket'] <= 1)]
d8_x2.loc[:, 'Bucket'] = "(0,1]"
 
 
d8_x3 = d8.loc[(d8['Bucket'] > 1) & (d8['Bucket'] <= 3)]
d8_x3.loc[:, 'Bucket'] = "(1,2]"
 
 
d8_x4 = d8.loc[(d8['Bucket'] > 2) & (d8['Bucket'] <= 3)]
d8_x4.loc[:, 'Bucket'] = "(2,3]"
 
 
d8_x5 = d8.loc[(d8['Bucket'] > 3)]
d8_x5.loc[:, 'Bucket'] = "(3,+inf)"
d8 = pd.concat([d8_x1, d8_x2, d8_x3, d8_x4, d8_x5])
 
 
x8_d, x8_iv, x8_woe = woe_value(d8)
x8_cut = [float('-inf'), 0, 1, 2, 3, float('+inf')]


d9 = pd.DataFrame({"X": train_X['NumberOfTime60-89DaysPastDueNotWorse'], "Y": train_y})
d9['Bucket'] = d9['X']
d9_x1 = d9.loc[(d9['Bucket'] <= 0)]
d9_x1.loc[:, 'Bucket'] = "(-inf,0]"
 
 
d9_x2 = d9.loc[(d9['Bucket'] > 0) & (d9['Bucket'] <= 1)]
d9_x2.loc[:, 'Bucket'] = "(0,1]"
 
 
d9_x3 = d9.loc[(d9['Bucket'] > 1) & (d9['Bucket'] <= 3)]
d9_x3.loc[:, 'Bucket'] = "(1,3]"
 

d9_x4 = d9.loc[(d9['Bucket']>3)]
d9_x4.loc[:, 'Bucket'] = "(3,+inf)"
d9 = pd.concat([d9_x1, d9_x2, d9_x3, d9_x4])
 
 
x9_d, x9_iv, x9_woe = woe_value(d9)
x9_cut = [float('-inf'), 0, 1, 3, float('+inf')]


d10 = pd.DataFrame({"X": train_X['NumberOfDependents'], "Y": train_y})
d10['Bucket'] = d10['X']
d10_x1 = d10.loc[(d10['Bucket'] <= 0)]
d10_x1.loc[:, 'Bucket'] = "(-inf,0]"
 
 
d10_x2 = d10.loc[(d10['Bucket'] > 0) & (d10['Bucket'] <= 1)]
d10_x2.loc[:, 'Bucket'] = "(0,1]"
 
d10_x3 = d10.loc[(d10['Bucket'] > 1) & (d10['Bucket'] <= 2)]
d10_x3.loc[:, 'Bucket'] = "(1,2]"

d10_x4 = d10.loc[(d10['Bucket'] > 2) & (d10['Bucket'] <= 3)]
d10_x4.loc[:, 'Bucket'] = "(2,3]"
 
 
d10_x5 = d10.loc[(d10['Bucket'] > 3) & (d10['Bucket'] <= 5)]
d10_x5.loc[:, 'Bucket'] = "(3,5]"
 
 
d10_x6 = d10.loc[(d10['Bucket'] > 5)]
d10_x6.loc[:, 'Bucket'] = "(5,+inf)"
d10 = pd.concat([d10_x1, d10_x2, d10_x3, d10_x4, d10_x5, d10_x6])
 
 
x10_d, x10_iv, x10_woe = woe_value(d10)
x10_cut = [float('-inf'), 0, 1, 2, 3, 5, float('+inf')]


# 特征筛选：多变量分析---用于剔除特征高度相关的某些特征
corr = train_data.corr()
xticks = ['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']
yticks = list(corr.index)
fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1)
sns.heatmap(corr, annot=True, cmap='RdBu', ax=ax1, annot_kws={'size': 5,  'color': 'blue'})
ax1.set_xticklabels(xticks, rotation=0, fontsize=10)
ax1.set_yticklabels(yticks, rotation=0, fontsize=10)
plt.show()


# 特征筛选：单变量分析---用于剔除特征高度相关的某些特征
informationValue = []
informationValue.append(x1_iv)
informationValue.append(x2_iv)
informationValue.append(x3_iv)
informationValue.append(x4_iv)
informationValue.append(x5_iv)
informationValue.append(x6_iv)
informationValue.append(x7_iv)
informationValue.append(x8_iv)
informationValue.append(x9_iv)
informationValue.append(x10_iv)
informationValue
 
index = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']
index_num = range(len(index))
ax = plt.bar(index_num, informationValue, tick_label=index)
plt.show()


# 基于筛选后的特征，将特征WOE化
def trans_woe(var, var_name, x_woe, x_cut):
    woe_name = var_name + '_woe'
    for i in range(len(x_woe)):
        if i == 0:
            var.loc[(var[var_name] <= x_cut[i+1]), woe_name] = x_woe[i]
        elif (i > 0) and (i <= len(x_woe)-2):
            var.loc[((var[var_name] > x_cut[i]) & (var[var_name] <= x_cut[i+1])), woe_name] = x_woe[i]
        else:
            var.loc[(var[var_name] > x_cut[len(x_woe)-1]), woe_name] = x_woe[len(x_woe)-1]
    return var


x1_name = 'RevolvingUtilizationOfUnsecuredLines'
x2_name = 'age'
x3_name = 'NumberOfTime30-59DaysPastDueNotWorse'
x7_name = 'NumberOfTimes90DaysLate'
x9_name = 'NumberOfTime60-89DaysPastDueNotWorse'
 
train_X = trans_woe(train_X, x1_name, x1_woe, x1_cut)
train_X = trans_woe(train_X, x2_name, x2_woe, x2_cut)
train_X = trans_woe(train_X, x3_name, x3_woe, x3_cut)
train_X = trans_woe(train_X, x7_name, x7_woe, x7_cut)
train_X = trans_woe(train_X, x9_name, x9_woe, x9_cut)


# 建立模型

# 对训练集结果计算AUC ROC KS值
# 方法1： 直接用lr预测数值  -- 预测结果0.85

# test_X = trans_woe(test_X,x1_name,x1_woe,x1_cut)
# test_X = trans_woe(test_X,x2_name,x2_woe,x2_cut)
# test_X = trans_woe(test_X,x3_name,x3_woe,x3_cut)
# test_X = trans_woe(test_X,x7_name,x7_woe,x7_cut)
# test_X = trans_woe(test_X,x9_name,x9_woe,x9_cut)

# from sklearn.linear_model.logistic import LogisticRegression

# lr = LogisticRegression()
# lr.fit(train_X, train_y)

# # 注意predict 和predict_proba的区别
# resu = lr.predict_proba(test_X)
# resuLabel = lr.predict(test_X)

# print('predict_proba:',resu)
# print('-------')
# print('predict:',resuLabel)

# from sklearn.metrics import roc_curve,auc
# # X3=sm.add_constant(test_X)
# # resu=result.predict(X3)
# fpr,tpr,thershold=roc_curve(test_y,resu[:,1])
# rocauc=auc(fpr,tpr)
# plt.plot(fpr,tpr,'b',label='AUC=%0.2f'%rocauc)
# plt.legend()
# plt.plot([0,1],[0,1],'r--')
# plt.xlim([0,1])
# plt.ylim([0,1])
# plt.ylabel('TPR')
# plt.xlabel('FPR')
# plt.title('逻辑回归预测')
# plt.show()

# print ('KS:',max(tpr-fpr))


# 验证集测试
# 为模型增加常数项，即回归线在 y 轴上的截距
X1 = sm.add_constant(train_X)
logit = sm.Logit(train_y, X1)
result = logit.fit()
print(result.summary())

test_X = trans_woe(test_X, x1_name, x1_woe, x1_cut)
test_X = trans_woe(test_X, x2_name, x2_woe, x2_cut)
test_X = trans_woe(test_X, x3_name, x3_woe, x3_cut)
test_X = trans_woe(test_X, x7_name, x7_woe, x7_cut)
test_X = trans_woe(test_X, x9_name, x9_woe, x9_cut)


X3 = sm.add_constant(test_X)
resu = result.predict(X3)
fpr, tpr, thershold = roc_curve(test_y, resu)
rocauc = auc(fpr, tpr)
plt.plot(fpr, tpr, 'b', label='AUC=%0.2f' % rocauc)
plt.legend()
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('TPR')
plt.xlabel('FPR')
plt.show()


# 建立评分卡
p = 20/np.log(2)    # 设置PDO为20
q = 600 - 20*np.log(20)/np.log(2)  # 设置定点分数为600


def get_score(coe, woe, factor):
    scores = []
    for w in woe:
        score = round(coe*w*factor, 0)
        scores.append(score)
    return scores


# x_coe表示利用逻辑回归拟合后的系数矩阵
x_coe = [2.6084, 0.6327, 0.5151, 0.5520, 0.5747, 0.4074]

baseScore = round(q + p * x_coe[0], 0)
x1_score = get_score(x_coe[1], x1_woe, p)

x1_score = get_score(x_coe[1], x1_woe, p)
x2_score = get_score(x_coe[2], x2_woe, p)
x3_score = get_score(x_coe[3], x3_woe, p)
x7_score = get_score(x_coe[4], x7_woe, p)
x9_score = get_score(x_coe[5], x9_woe, p)

score = [x1_score, x2_score, x3_score, x7_score, x9_score]


# 建立一个函数使得当输入x1,x2,x3,x7,x9的值时可以返回评分数
cut_t = [x1_cut, x2_cut, x3_cut, x7_cut, x9_cut]
score = [x1_score, x2_score, x3_score, x7_score, x9_score]


def compute_score(x):  # x为数组，包含x1,x2,x3,x7和x9的取值
    tot_score = baseScore
    cut_d = copy.deepcopy(cut_t)
    print(cut_d)
    for j in range(len(cut_d)):
        cut_d[j].append(x[j])
        cut_d[j].sort()      
        print(cut_d[j])
        for i in range(len(cut_d[j])):
            if cut_d[j][i] == x[j]:
                print(score[j][i-1])  # 注意一个问题，边界值重复加了两遍，是否需要？
                tot_score = score[j][i-1] + tot_score
    return tot_score


# 输入一个根据筛选后的特征类似的值，并得到最后的评分结果
x_score = [0.3, 44, 3, 3, 5]
compute_score(x_score)
